Primary Vignette - Classification Strategies
Objectives
Perform exploratory data analysis
Reduce dimensionality using principal component analysis
Employ logistic regression using
glm()andmultinom()
We’ll illustrate these strategies using the California Household Travel Survey (CHTS) dataset.
# Loading necessary packages
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(tidymodels)── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom 1.0.7 ✔ rsample 1.2.1
✔ dials 1.3.0 ✔ tune 1.2.1
✔ infer 1.0.7 ✔ workflows 1.1.4
✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
✔ parsnip 1.2.1 ✔ yardstick 1.3.1
✔ recipes 1.1.0
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter() masks stats::filter()
✖ recipes::fixed() masks stringr::fixed()
✖ dplyr::lag() masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step() masks stats::step()
• Search for functions across packages at https://www.tidymodels.org/find/
library(sparsesvd)
library(nnet)
library(sf)Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(mapview)
mapviewOptions(fgb = FALSE)
library(leafsync)
library(maps)
Attaching package: 'maps'
The following object is masked from 'package:purrr':
map
# Read in datasets
PersonData <- read_rds('./Data/raw/PersonData_111A.Rds')
HHData <- read_rds('./Data/raw/HHData_111A.Rds')
hh_bgDensity <- read_rds('./Data/raw/hh_bgDensity.Rds')
county_shp <- st_read("./Data/raw/counties/counties.shp")Reading layer `counties' from data source
`/Users/valeriedelafuente/Documents/Capstone/vignette-group3/Data/raw/counties/counties.shp'
using driver `ESRI Shapefile'
Simple feature collection with 58 features and 17 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -124.3926 ymin: 32.53578 xmax: -114.1312 ymax: 42.00952
Geodetic CRS: WGS 84
# Merge datasets
personHHData <- left_join(PersonData, HHData) %>%
left_join(hh_bgDensity)Joining with `by = join_by(hhid)`
Joining with `by = join_by(hhid)`
# Preprocess data
numeric_columns <- sapply(personHHData, is.numeric)
numeric_data <- personHHData[, numeric_columns]
numeric_data <- numeric_data %>% select(-CTFIP, -hhid, -bg_density)
scaled_data <- scale(numeric_data)
hhid <- personHHData$hhid
bg_group <- as.factor(personHHData$bg_group)
scaled_data <- cbind(hhid, bg_group, scaled_data)
scaled_data_clean <- na.omit(scaled_data) %>%
as.data.frame()Exploratory Data Analysis
Before we partition our data and fit classification models, we want to get to know our data through some exploratory visualizations. Since our data is geographically structured, we found it helpful to visualize our data on a map.
Static Map
county_bg_aggreg <- personHHData %>%
group_by(County, CTFIP, bg_group) %>% # group by county, CTFIP, and also bg_group
mutate(count = n()) %>%
summarise_at(vars(-hhid, -pnum), mean)
county_bg_shp <- county_shp %>%
merge(data.frame(bg_group = c("Urban", "Suburban", "Exurban", "Rural"))) %>%
left_join(county_bg_aggreg)
# get the CA county data
county <- ggplot2::map_data("county", region = "california")
county_bg <- merge(county, data.frame(bg_group = c("Urban", "Suburban", "Exurban", "Rural")))
county_bg_all <- county_bg_aggreg %>%
mutate(subregion = tolower(County)) %>%
full_join(county_bg, by = c("subregion", "bg_group"))
ggplot(county_bg_all) +
geom_polygon(aes(x = long, y = lat, group = subregion, fill = Sum_PMT), colour = "white") +
scale_fill_distiller(palette = "YlGnBu", direction = 1) +
facet_wrap(vars(bg_group), nrow = 2) + # multi-panel plots using facet_wrap(), plot in 2 rows
ggtitle("Total PMT in California at County-level") +
theme_void() +
theme(legend.position="bottom")
Sum Trips by Residential Area
urban_TripMap <- mapview(filter(county_bg_shp, bg_group == "Urban"),
zcol = "Sum_Trips", legend = TRUE, popup = NULL,
layer.name = "Urban Trips")
suburb_TripMap <- mapview(filter(county_bg_shp, bg_group == "Suburban"),
zcol = "Sum_Trips", legend = TRUE, popup = NULL,
layer.name = "Suburban Trips")
exurb_TripMap <- mapview(filter(county_bg_shp, bg_group == "Exurban"),
zcol = "Sum_Trips", legend = TRUE, popup = NULL,
layer.name = "Exurban Trips")
rural_TripMap <- mapview(filter(county_bg_shp, bg_group == "Rural"),
zcol = "Sum_Trips", legend = TRUE, popup = NULL,
layer.name = "Rural Trips")
latticeview(urban_TripMap, suburb_TripMap, exurb_TripMap, rural_TripMap, sync = "all")